import warnings
warnings.filterwarnings('ignore')
# !pip install mlxtend
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
# import os
# print(os.listdir("../input"))
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from mlxtend.plotting import plot_confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
# Any results you write to the current directory are saved as output.
# Load the airline tweets dataset and run a quick first-pass exploration.
df = pd.read_csv("Tweets.csv")
df.head()

# Show full, untruncated cell contents so whole tweets are readable.
pd.set_option('display.max_colwidth', None)
df.loc[[0]]

print("Shape of the dataframe is", df.shape, "\n")
print("The number of nulls in each column are: \n", df.isna().sum())

# Per-column percentage of missing values, rounded to two decimals.
print("Percentage null or na values in df")
missing_pct = (df.isnull() | df.isna()).sum().mul(100).div(df.index.size).round(2)
missing_pct

df.describe()
df['airline'].unique()

tweets_per_airline = df.groupby('airline')['airline_sentiment'].count().sort_values(ascending=False)
print("Total number of tweets for each airline \n ", tweets_per_airline)
# Per-airline distribution of tweet sentiment, one bar chart per airline.
# airlines= ['US Airways','United','American','Southwest','Delta','Virgin America']
airlines = df.airline.unique().tolist()
sentiment_order = ['negative', 'neutral', 'positive']
plt.figure(1, figsize=(12, 12))
for idx, airline in enumerate(airlines):
    plt.subplot(2, 3, idx + 1)
    airline_df = df[df['airline'] == airline]
    # BUG FIX: value_counts() orders by frequency, which need not match the
    # fixed x-tick labels below -- reindex so each bar lines up with its
    # label (fillna covers an airline with zero tweets of some sentiment).
    count = airline_df['airline_sentiment'].value_counts().reindex(sentiment_order).fillna(0)
    positions = [1, 2, 3]
    plt.bar(positions, count, color=['red', 'green', 'blue'])
    plt.xticks(positions, sentiment_order)
    plt.ylabel('Mood Count')
    plt.xlabel('Mood')
    plt.title('Count of Moods of ' + airline)
# pip install wordcloud
from wordcloud import WordCloud, STOPWORDS

# Word cloud of the most frequent terms in NEGATIVE tweets.
new_df = df[df['airline_sentiment'] == 'negative']
words = ' '.join(new_df['text'])
# Drop URLs, @mentions and retweet markers before building the cloud.
cleaned_word = " ".join(
    word for word in words.split()
    if 'http' not in word and not word.startswith('@') and word != 'RT'
)
wordcloud = WordCloud(
    stopwords=STOPWORDS,
    background_color='black',
    width=3000,
    height=2500,
).generate(cleaned_word)
plt.figure(1, figsize=(12, 12))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
# Word cloud of the most frequent terms in POSITIVE tweets
# (same URL/@mention/RT filtering as the negative cloud above).
new_df = df[df['airline_sentiment'] == 'positive']
words = ' '.join(new_df['text'])
cleaned_word = " ".join(
    word for word in words.split()
    if 'http' not in word and not word.startswith('@') and word != 'RT'
)
wordcloud = WordCloud(
    stopwords=STOPWORDS,
    background_color='black',
    width=3000,
    height=2500,
).generate(cleaned_word)
plt.figure(1, figsize=(12, 12))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
# Calculate highest frequency words in a cleaned tweet string
def freq(text, threshold=50):
    """Print every word in *text* whose frequency exceeds *threshold*.

    Parameters
    ----------
    text : str
        Whitespace-separated words to count.
    threshold : int, optional
        Only words occurring strictly more than this many times are
        printed (default 50, matching the original hard-coded cutoff).

    Returns
    -------
    None
    """
    # Local import: the file's top-level import block is outside this cell.
    from collections import Counter
    # Counter does a single O(n) pass; the original called list.count()
    # once per unique word, an O(n^2) scan. Counter preserves first-seen
    # order (dicts are insertion-ordered), matching the original output.
    counts = Counter(text.split())
    for word, count in counts.items():
        if count > threshold:
            print('Frequency of', word, 'is :', count)
# freq() prints its findings and returns None, so this line also prints "None".
print(freq(cleaned_word))
#get the number of negative reasons
df['negativereason'].nunique()
# NOTE(review): this dict is never used -- the NR_Count *function* defined
# immediately below shadows the name before any read.
NR_Count=dict(df['negativereason'].value_counts(sort=False))
def NR_Count(Airline):
    """Return a DataFrame of negative-reason counts for one airline.

    Parameters
    ----------
    Airline : str
        An airline name from df['airline'], or 'All' for the whole dataset.

    Returns
    -------
    pandas.DataFrame
        Columns 'Reasons' and 'count', one row per distinct (non-NaN)
        negative reason found anywhere in df.
    """
    if Airline == 'All':
        subset = df
    else:
        subset = df[df['airline'] == Airline]
    count = dict(subset['negativereason'].value_counts())
    # All reasons seen across the full dataset, minus the NaN placeholder.
    Unique_reason = [x for x in df['negativereason'].unique() if str(x) != 'nan']
    Reason_frame = pd.DataFrame({'Reasons': Unique_reason})
    # BUG FIX: .get(..., 0) -- an airline may have no tweets for some
    # reason; the original count[x] lookup raised KeyError in that case.
    Reason_frame['count'] = Reason_frame['Reasons'].apply(lambda x: count.get(x, 0))
    return Reason_frame
def plot_reason(Airline):
    """Bar-plot the negative-reason counts for one airline (or 'All')."""
    reason_frame = NR_Count(Airline)
    positions = range(1, len(reason_frame) + 1)
    bar_colors = ['red', 'yellow', 'blue', 'green', 'black',
                  'brown', 'gray', 'cyan', 'purple', 'orange']
    plt.bar(positions, reason_frame['count'], color=bar_colors)
    plt.xticks(positions, reason_frame['Reasons'], rotation=90)
    plt.ylabel('Count')
    plt.xlabel('Reason')
    plt.title('Count of Reasons for ' + Airline)
# One negative-reason chart for the overall dataset...
plot_reason('All')

# ...then one subplot per airline.
plt.figure(2, figsize=(13, 13))
for position, airline in enumerate(airlines, start=1):
    plt.subplot(2, 3, position)
    plt.subplots_adjust(hspace=0.9)
    plot_reason(airline)
# Our dataframe has data from 2015-02-17 to 2015-02-24.
# It will be interesting to see if the date has any effect on the sentiment of
# the tweets (especially negative!). We can draw various conclusions by
# visualizing this.
date = df.reset_index()
# Parse the raw timestamp strings, then keep only the calendar date
# (no time-of-day component).
date['tweet_created'] = pd.to_datetime(date['tweet_created'])
date['tweet_created'] = date['tweet_created'].dt.date
date['tweet_created'].head()
df = date
# Tweet counts per (date, airline, sentiment) combination.
day_df = df.groupby(['tweet_created', 'airline', 'airline_sentiment']).size()
# day_df = day_df.reset_index()
day_df
# This shows the sentiments of tweets for each date from 2015-02-17 to
# 2015-02-24 for every airline in our dataframe.
# Our next step will be to plot this and get a better visualization of the
# negative tweets.
# Keep only the 'negative' slice of the (date, airline, sentiment) counts.
day_df = day_df.loc(axis=0)[:, :, 'negative']

# Group by date/airline and plot the daily negative-tweet counts per airline.
negative_by_day = day_df.groupby(['tweet_created', 'airline']).sum().unstack()
ax2 = negative_by_day.plot(
    kind='bar',
    color=['red', 'green', 'blue', 'yellow', 'purple', 'orange'],
    figsize=(15, 6),
    rot=70,
)
labels = ['American', 'Delta', 'Southwest', 'US Airways', 'United', 'Virgin America']
ax2.legend(labels=labels)
ax2.set_xlabel('Date')
ax2.set_ylabel('Negative Tweets')
plt.show()

# From here on only the tweet text and its sentiment label are needed.
df = df[['text', 'airline_sentiment']]
print("Shape of the dataframe is", df.shape, "\n")
df.head(5)
#
# Function to remove the contractions
#
import contractions

def replace_contractions(text):
    """Expand contractions in *text* (e.g. "don't" -> "do not")."""
    return contractions.fix(text)
#
# Build a stopword list that keeps the negations 'no'/'not' (they flip the
# meaning of a sentiment) and additionally drops the word 'pep'.
#
from nltk.corpus import stopwords

stopword_list = stopwords.words('english')
for negation in ('no', 'not'):
    stopword_list.remove(negation)
stopword_list.append('pep')
print(stopwords.words("english"))
#
# function to remove accented characters
#
import unicodedata

def remove_accented_chars(text):
    """Transliterate accented characters to their ASCII base form.

    Characters with no ASCII equivalent after NFKD decomposition are
    dropped entirely.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')
#
# function to remove special characters and optionally digits
#
def remove_special_characters(text, remove_digits=False):
    """Strip every character that is not a letter, whitespace, or
    (unless *remove_digits* is True) a digit.

    BUG FIX: the original pattern used the range 'a-zA-z', which spans
    ASCII 65-122 and so also matched '[', '\\', ']', '^', '_' and '`';
    'a-zA-Z' excludes them as intended.
    """
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    return re.sub(pattern, '', text)
#
# Pre-processing pipeline for one tweet:
# 1. Remove HTML tags using 'BeautifulSoup'
# 2. Replace contractions using the 'contractions' library
# 3. Remove accented characters using unicodedata
# 4. Remove special characters and digits using regex
# 5. Convert all letters to lowercase and split on whitespace
# 6. Remove stopwords (keeping 'no'/'not', additionally dropping 'pep')
# 7. Join the tokenized words to make it a text again
#
# On this cleaned data we will perform stemming and lemmatization
#
from bs4 import BeautifulSoup

def _build_tweet_stopwords():
    """Stopword set for tweet cleaning: keep the negations 'no' and 'not'
    (they flip sentiment); additionally drop 'pep'."""
    stops = stopwords.words("english")
    stops.remove('no')
    stops.remove('not')
    stops.append('pep')
    # Searching a set is O(1) vs O(n) for a list.
    return set(stops)

# Built once at import time; the original rebuilt this set on every call.
_TWEET_STOPWORDS = _build_tweet_stopwords()

def tweet_to_words(raw_review):
    """Convert one raw tweet to a cleaned, space-joined string of words.

    The input is a single string (a raw tweet); the output is a single
    preprocessed string ready for vectorization.
    """
    # 1. Remove HTML. "html.parser" pins the stdlib parser and silences
    # BeautifulSoup's no-parser-specified warning.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    # 2. Replace contractions ("don't" -> "do not").
    review_text = replace_contractions(review_text)
    # 3. Remove accented characters.
    review_text = remove_accented_chars(review_text)
    # 4. Remove non-letters (special characters and digits).
    letters_only = remove_special_characters(review_text, remove_digits=True)
    # 5. Convert to lower case, split into individual words.
    # (The original also ran nltk.word_tokenize here but immediately
    # discarded the result -- that dead call has been removed.)
    words = letters_only.lower().split()
    # 6. Remove stop words.
    meaningful_words = [w for w in words if w not in _TWEET_STOPWORDS]
    # 7. Join the words back into one string separated by space.
    return " ".join(meaningful_words)
#
# clean the 'text' column into a new column 'clean_tweet'
# (apply takes the function directly; no lambda wrapper needed)
#
df['clean_tweet'] = df['text'].apply(tweet_to_words)
#
# check the cleaned tweet in 'clean_tweet'
#
df.head(5)
#
# Functions to perform stemming
#
from nltk.stem import LancasterStemmer, WordNetLemmatizer

def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    # Comprehension replaces the original append loop; same output list.
    return [stemmer.stem(word) for word in words]
def simple_stemmer(text):
    """Porter-stem each whitespace-separated token and re-join with spaces."""
    stemmer = nltk.porter.PorterStemmer()
    return ' '.join(stemmer.stem(token) for token in text.split())
#
# Functions to perform lemmatization
#
def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    # Comprehension replaces the original append loop; same output list.
    return [lemmatizer.lemmatize(word, pos='v') for word in words]
import spacy
# NOTE(review): the parse/tag/entity keyword args are from old spaCy
# releases and were removed in later versions -- confirm the pinned
# spaCy version before upgrading.
nlp = spacy.load('en_core_web_sm', parse=True, tag=True, entity=True)
def lemmatize_text(text):
    """Lemmatize *text* with spaCy, keeping pronouns as their surface form."""
    text = nlp(text) # encode to spacy format
    # '-PRON-' is spaCy's placeholder lemma for pronouns (not proper
    # nouns); substitute the original token text so pronouns survive.
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text
#
# Single entry points that perform stemming and lemmatization together
#
def stem_and_lemmatize(words):
    """Return (stems, lemmas) for a list of tokenized words."""
    return stem_words(words), lemmatize_verbs(words)

def simple_stem_and_lemmatize(words):
    """Return (stems, lemmas) for a raw text string."""
    return simple_stemmer(words), lemmatize_text(words)
#
# Create columns for stemmed and lemmatized text. Stemming and
# lemmatization are performed over the cleaned 'clean_tweet' text.
#
# .apply() replaces the original iterrows()/at[] row loop: same resulting
# columns (created in the same lemma-then-stem order), one pass per column
# instead of per-row scalar writes.
# (stem_and_lemmatize() was the original's commented-out alternative.)
df['lemma'] = df['clean_tweet'].apply(lemmatize_text)
df['stem'] = df['clean_tweet'].apply(simple_stemmer)
#
# check the stemmed and lemmatized text
#
df.head()
#
# The data is split in the standard 80/20 ratio, stratified on sentiment.
#
train, test = train_test_split(df, test_size=0.2, random_state=42,
                               stratify=df.airline_sentiment)
#
# Materialize train/test document lists from the 'stem' text...
# (list(series) produces the same list as the original append loops)
#
train_clean_tweet_stem = list(train['stem'])
test_clean_tweet_stem = list(test['stem'])
#
# ...and from the 'lemma' text.
#
train_clean_tweet_lemma = list(train['lemma'])
test_clean_tweet_lemma = list(test['lemma'])
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words document-term matrix over the stemmed tweets.
print("Creating CountVectorized bag of words...\n")
v_cv = CountVectorizer(analyzer="word")
train_features_cv = v_cv.fit_transform(train_clean_tweet_stem)
test_features_cv = v_cv.transform(test_clean_tweet_stem)
# summarize v_cv
# print(v_cv.vocabulary_)
print(train_features_cv.shape)
# print(type(train_features_cv))
# print(train_features_cv.toarray())

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighted document-term matrix over the same stemmed tweets.
v_tfidf = TfidfVectorizer()
train_features_tfidf = v_tfidf.fit_transform(train_clean_tweet_stem)
test_features_tfidf = v_tfidf.transform(test_clean_tweet_stem)
print(test_features_tfidf.shape)
# Accumulator for per-classifier accuracy results; `ind` is the next row
# index to write.
df_result_mod_train = pd.DataFrame(
    {'Classifier': [], 'processed_data': [], 'Vectorizer': [], 'Accuracy': []})
ind = 0

# Models benchmarked against each (text column, vectorizer) combination.
Classifiers = [
    DecisionTreeClassifier(),
    RandomForestClassifier(n_jobs=-1, n_estimators=500),
    GradientBoostingClassifier(n_estimators=500),
    LogisticRegression(max_iter=500),
    SVC(),
    KNeighborsClassifier(n_neighbors=3),
    AdaBoostClassifier(),
    GaussianNB(),
]
from yellowbrick.classifier import ClassificationReport, ROCAUC

def visClassifierResults(model_w_parameters, X_train, y_train, X_test, y_test):
    """Render yellowbrick classification-report and ROC/AUC plots for a
    fitted-or-fittable model on the given train/test split."""
    # Same fit/score/show sequence for each visualizer, in the same order
    # as the original (ClassificationReport first, then ROCAUC).
    for visualizer_cls in (ClassificationReport, ROCAUC):
        visualizer = visualizer_cls(model_w_parameters)
        visualizer.fit(X_train, y_train)
        visualizer.score(X_test, y_test)
        visualizer.show()
def fit_classifier(train_data, test_data, col_name, vectorizer_name):
    """Fit every classifier in `Classifiers` on one (column, vectorizer)
    document-term matrix; print accuracy, a classification report, and a
    confusion matrix for each.

    Results are appended to the module-level `df_result_mod_train`
    (row position tracked by the module-level counter `ind`).

    Parameters
    ----------
    train_data, test_data : scipy sparse matrices
        Vectorized train/test documents.
    col_name : str
        Name of the source text column (for reporting only).
    vectorizer_name : str
        Name of the vectorizer used (for reporting only).
    """
    global ind
    # Dense copies for estimators (e.g. GaussianNB) and the visualizers,
    # which cannot consume a sparse matrix.
    dense_features = train_data.toarray()
    dense_test = test_data.toarray()
    y_train = train['airline_sentiment']
    y_test = test['airline_sentiment']
    for classifier in Classifiers:
        try:
            fit = classifier.fit(train_data, y_train)
            pred = fit.predict(test_data)
        except Exception:
            # Sparse input rejected -- retry with the dense matrices.
            fit = classifier.fit(dense_features, y_train)
            pred = fit.predict(dense_test)
        # BUG FIX: sklearn metrics expect (y_true, y_pred); the original
        # passed the predictions first, which transposed the confusion
        # matrix and swapped precision/recall in the classification report
        # (accuracy_score is symmetric, so its value was unaffected).
        accuracy = accuracy_score(y_test, pred)
        name = classifier.__class__.__name__
        print(name+' on cleaned '+col_name+' text with vectorizer: '+vectorizer_name)
        print('==============================================================================================')
        print('Accuracy of '+name+' on cleaned '+col_name+' text with vectorizer: '+vectorizer_name+' is '+str(accuracy))
        df_result_mod_train.loc[ind] = [name, col_name, vectorizer_name, accuracy]
        ind = ind + 1
        print(classification_report(y_test, pred))
        cm = confusion_matrix(y_test, pred)
        plt.figure()
        plot_confusion_matrix(cm, figsize=(12, 8), hide_ticks=True, cmap=plt.cm.Reds)
        plt.xticks(range(3), ['Negative', 'Neutral', 'Positive'], fontsize=16, color='black')
        plt.yticks(range(3), ['Negative', 'Neutral', 'Positive'], fontsize=16)
        plt.show()
        visClassifierResults(classifier, dense_features, y_train, dense_test, y_test)
def process_column(col):
    """Vectorize train/test text from column *col* with both CountVectorizer
    and TfidfVectorizer, and benchmark every classifier on each.

    Parameters
    ----------
    col : str
        Name of the text column in the train/test DataFrames
        (e.g. 'stem', 'lemma', 'clean_tweet').
    """
    train_clean_tweet = list(train[col])
    test_clean_tweet = list(test[col])

    print("Creating CountVectorized bag of words...\n")
    v_cv = CountVectorizer(analyzer="word")
    train_features_cv = v_cv.fit_transform(train_clean_tweet)
    test_features_cv = v_cv.transform(test_clean_tweet)
    print("CountVectorized shape: ", train_features_cv.shape)
    # Fit the DTM generated using CountVectorizer
    fit_classifier(train_features_cv, test_features_cv, col, 'CountVectorized')

    v_tfidf = TfidfVectorizer()
    # BUG FIX: the original fit the TF-IDF vectorizer on the module-level
    # train_clean_tweet_stem/test_clean_tweet_stem lists, so the TF-IDF
    # results ignored `col` and always used the stemmed text. Use the
    # column-specific documents built above instead.
    train_features_tfidf = v_tfidf.fit_transform(train_clean_tweet)
    test_features_tfidf = v_tfidf.transform(test_clean_tweet)
    print("TfIDF shape: ", test_features_tfidf.shape)
    # Fit the DTM generated using TF-IDF
    fit_classifier(train_features_tfidf, test_features_tfidf, col, 'TfidfVectorizer')
# Benchmark all classifiers on the stemmed text (both vectorizers).
process_column('stem')
# NOTE: sort_values returns a new frame and does not modify
# df_result_mod_train in place; these bare expressions only display
# their value when run in a notebook cell.
df_result_mod_train.sort_values(by='Accuracy', ascending=False, ignore_index=True)
# Fit the DTM generated using CountVectorizer
# fit_classifier(train_features_cv, test_features_cv)
# Benchmark all classifiers on the lemmatized text (both vectorizers).
process_column('lemma')
# process_column('clean_tweet')
df_result_mod_train.sort_values(by='Accuracy', ascending=False, ignore_index=True)
# process_column('text')
# Results for the raw 'text' column -- empty unless process_column('text')
# above is uncommented and run.
df_result_mod_train[df_result_mod_train.processed_data == 'text'].sort_values(by='Accuracy', ascending=False, ignore_index=True)
df_result_mod_train.sort_values(by='Accuracy', ascending=False, ignore_index=True)
# TODO: BN
# Analysis from EDA
# Write about the text pre-processing
# Write about the vectorizer
# write about the classifiers performance i.e. accuracy, recall, precision, f1-score etc.
# note on performance of tf-idf v/s count vectorizer in terms of score and computer resources